This is an R Markdown document to accompany a final project paper for G588: Applied Spatial Statistics instructed by Dr. Scott Robeson.
In 2018, Microsoft released a computer-generated building footprints GIS layer for the US as open data. This document demonstrates some of the methods used in R to explore this dataset. For additional context, refer to the accompanying paper.
A brief look at the largest building in Indiana. The largest “building” identified through the algorithm appears to be a series of buildings of a rail loading area of US Steelworks in Gary:
setwd('D:/G588')
The second largest building in Indiana according to the dataset is the Toyota Plant near Princeton.
It is 328924.2449 square meters, which is over 81 acres:
# Read the footprints-plus-census layer from the current working directory.
shape <- readOGR(".", "footprints_plus_census2000")
## OGR data source with driver: ESRI Shapefile
## Source: "D:\G588", layer: "footprints_plus_census2000"
## with 92 features
## It has 29 fields
## Integer64 fields read as strings: NCAPC_1 CNTY_FIPS STFID POP2000 AGE_65_UP HOUSEHOLDS FAMILIES HSE_UNITS VACANT OWNER_OCC RENTER_OCC FREQUENCY COUNT_OBJE
# `dat` is the layer's attribute table; `ufdat` is the same table passed
# through unfactor(), which is the copy the arithmetic on the census fields
# uses. NOTE(review): unfactor() is not defined in the visible source
# (presumably varhandle::unfactor) -- confirm.
dat <- slot(shape, "data")
ufdat <- unfactor(dat)
# Earlier exploratory plots, kept for reference but disabled:
# bxp <- boxplot(dat$SUM_SHAPE_, horizontal = TRUE, axes = FALSE)
# mtext(c("Min", "Max"), at = bxp$stats[c(1, 5)], line = -3)
# plot(dat[20:29])
# plot(dat$HOUSEHOLDS, dat$FREQUENCY)
# par(mfrow = c(2, 3), pty = "s")

# Min / quartiles / mean / max of the HOUSEHOLDS field across all features.
summary(ufdat[["HOUSEHOLDS"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2201 8028 12642 25395 25149 352164
# Distribution of the number of building footprints per county (FREQUENCY),
# drawn on a density scale rather than as raw counts.
hist(ufdat$FREQUENCY, probability = TRUE)

# Persons per building for every county: 2000 census population divided by
# the number of footprints. The ratio is computed once; `br` is kept as a
# second name for the same vector because the original script defined both.
sums <- ufdat$POP2000 / ufdat$FREQUENCY
br <- sums

# Quick look at the ratio with default bins.
hist(sums,
  main = "Persons per building",
  xlab = "Persons per building"
)

# Ratio against population, each point labelled with NAME_U
# (presumably the county name).
plot(br, ufdat$POP2000,
  xlab = "Persons per building",
  ylab = "Population"
)
text(br, ufdat$POP2000, labels = ufdat$NAME_U)

# Finer histogram (number of counties per bin) with a kernel density curve.
ppb_hist <- hist(sums,
  breaks = 20,
  main = "Persons per building Indiana Counties",
  xlab = "Persons per building",
  ylab = "# Counties"
)
# density() integrates to 1 while the bars are counts, so the curve is scaled
# by (number of counties * bin width) to share the histogram's y-axis.
ppb_density <- density(sums)
ppb_bin_width <- diff(ppb_hist$breaks)[1]
lines(ppb_density$x, ppb_density$y * length(sums) * ppb_bin_width, col = "red")

# `sums` is persons per building (population / building count), so larger
# values mean more people per footprint.
plot(sums, ufdat$POP2000,
  xlab = "Persons per building",
  ylab = "Population"
)
text(sums, ufdat$POP2000, labels = ufdat$NAME_U)

# Same scatter with titles and fixed axis limits. The x-axis is persons per
# building, not buildings per capita.
plot(sums, ufdat$POP2000,
  main = "Persons per building", sub = "Indiana Counties",
  xlab = "Persons per building", ylab = "population",
  xlim = c(1, 2.8), ylim = c(1, 1000000)
)
text(sums, ufdat$POP2000, labels = ufdat$NAME_U)

# Print the per-county ratios.
sums
## [1] 1.304095 1.219358 2.075708 2.282572 2.322655 2.071341 1.883506 1.571852
## [9] 1.576822 1.602102 1.541634 1.412146 1.359920 2.120671 1.564495 1.362454
## [17] 1.367218 1.249432 1.733095 1.920458 1.554589 1.564681 1.523683 1.243700
## [25] 1.666260 1.270361 1.240923 2.105893 1.288162 1.560542 1.909604 2.475158
## [33] 1.203574 1.620228 1.355881 1.951736 1.813630 1.434484 1.326411 1.676621
## [41] 1.597224 1.471891 1.594151 1.525841 1.807245 1.530307 1.349272 2.615680
## [49] 1.601529 1.573913 1.644896 1.213759 1.069101 1.544107 1.872039 1.668309
## [57] 2.258765 1.437558 1.195929 1.358653 1.277177 1.763000 2.374877 1.166875
## [65] 1.176082 1.645986 1.628435 1.440657 1.352409 1.476145 1.195366 1.571111
## [73] 1.010028 1.531717 1.904522 1.258221 1.317662 1.502519 1.241869 1.359961
## [81] 1.762335 1.422855 1.659433 1.407878 1.158275 1.211613 2.017060 1.441571
## [89] 1.787877 1.511366 1.275314 2.142918
# County population and building counts, taken from the already-converted
# `ufdat` copy instead of re-running unfactor() on the same columns for
# every call.
pop2000 <- ufdat$POP2000
n_buildings <- ufdat$FREQUENCY

# Distribution of county population.
hist(pop2000,
  main = "Histogram of population (2000)",
  xlab = "Population"
)

# Number of building footprints against population, labelled with NAME_U.
plot(pop2000, n_buildings,
  xlab = "Population",
  ylab = "Number of buildings"
)
text(pop2000, n_buildings, labels = ufdat$NAME_U)

# Normal Q-Q plots (with reference line) for household size and for mean
# building size.
qqnorm(dat$AVE_HH_SZ, main = "Q-Q Plot of HH SIZE", pch = 19)
qqline(dat$AVE_HH_SZ)
qqnorm(dat$MEAN_SHAPE, main = "Q-Q Plot of Building Size", pch = 19)
qqline(dat$MEAN_SHAPE)
# Why is mean building size so different between counties?
# Histogram of the per-county mean footprint size with a kernel density curve.
mean_shape_hist <- hist(dat$MEAN_SHAPE)

# The overlay is built from the MEAN_SHAPE column only. Passing the whole data
# frame to lines() just triggers data.matrix() coercion warnings, and qqline()
# belongs on a Q-Q plot, not on a histogram.
# density() integrates to 1 while the bars are counts, so the curve is scaled
# by (number of values * bin width) to share the histogram's y-axis.
mean_shape_density <- density(dat$MEAN_SHAPE)
mean_shape_bin_width <- diff(mean_shape_hist$breaks)[1]
lines(
  mean_shape_density$x,
  mean_shape_density$y * length(dat$MEAN_SHAPE) * mean_shape_bin_width,
  col = "red"
)

# Min / quartiles / mean / max of the mean footprint size.
summary(dat$MEAN_SHAPE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 168.4 218.1 239.9 238.0 254.5 326.3
# Distribution of AVE_HH_SZ (presumably average household size).
hist(dat$AVE_HH_SZ)

# Shared inputs for the area plots below, all read from the converted copy.
# AREA is the total area of a county and SUM_SHAPE the total area covered by
# building footprints (see the axis titles further down).
county_area <- ufdat$AREA
footprint_area <- ufdat$SUM_SHAPE
county_pop <- ufdat$POP2000
county_label <- ufdat$NAME_U

# Quick look: footprint area against county area, labelled with NAME_U.
plot(county_area, footprint_area)
text(county_area, footprint_area, labels = county_label)

# Same scatter with titles.
plot(county_area, footprint_area,
  main = "Total area and Total area with buildings", sub = "Indiana Counties",
  xlab = "Total Area of County", ylab = "Total Area with Buildings"
)
text(county_area, footprint_area, labels = county_label)

# Square meters under roof divided by population gives a range of roughly
# 100 to 220 square meters under roof per person.
# Surprisingly, Marion and Lake counties are near the bottom at about 100 m2,
# and less populous areas have more footprint area per capita?
# Density and height (number of stories) are not accounted for in footprints.
footprint_per_capita <- footprint_area / county_pop
plot(footprint_per_capita, county_pop)
text(footprint_per_capita, county_pop, labels = county_label)

# Share of each county's area that is covered by buildings. The ratio is a
# fraction between 0 and 1 (not multiplied by 100), so the labels say
# "Fraction" rather than "Percentage".
built_fraction <- footprint_area / county_area
plot(built_fraction, county_pop,
  xlab = "Fraction of total area of a county covered by buildings",
  ylab = "pop",
  main = "Fraction of total area of a county covered by buildings"
)
text(built_fraction, county_pop, labels = county_label)
summary(built_fraction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001599 0.003711 0.005092 0.008528 0.008308 0.088437
# Building footprint area per person against county population, with titles
# and NAME_U labels on every point.
sqm_per_capita <- ufdat$SUM_SHAPE / ufdat$POP2000
plot(sqm_per_capita, ufdat$POP2000,
  main = "Square meters of building footprints per capita",
  sub = "Indiana Counties",
  xlab = "Square meters of building footprints per capita",
  ylab = "Population"
)
text(sqm_per_capita, ufdat$POP2000, labels = ufdat$NAME_U)

# Fraction of county area covered by buildings against total county area.
covered_fraction <- ufdat$SUM_SHAPE / ufdat$AREA
plot(covered_fraction, ufdat$AREA,
  xlab = "Fraction of county area covered by buildings",
  ylab = "Total area of county"
)
text(covered_fraction, ufdat$AREA, labels = ufdat$NAME_U)

# Fraction (not percentage) of each county that is covered by buildings.
summary(covered_fraction)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001599 0.003711 0.005092 0.008528 0.008308 0.088437
# Print the per-county fractions.
covered_fraction
## [1] 0.007050724 0.007425515 0.023720053 0.023545142 0.030260151 0.013174022
## [7] 0.008944985 0.007338194 0.006620017 0.006196689 0.008408619 0.004145028
## [13] 0.006409588 0.025016424 0.003652034 0.002476523 0.003525295 0.002268585
## [19] 0.004931510 0.005180272 0.005433080 0.006874246 0.004958680 0.003432774
## [25] 0.005225933 0.001750595 0.003838459 0.008273882 0.004401003 0.004726856
## [31] 0.014670236 0.013212550 0.001599072 0.004639839 0.004351759 0.013802850
## [37] 0.014078712 0.002452151 0.004072386 0.031472408 0.004572002 0.008168108
## [43] 0.003312692 0.006795101 0.009194889 0.011490726 0.002235702 0.088436832
## [49] 0.019167459 0.004163502 0.006285973 0.003271435 0.003088848 0.006317266
## [55] 0.019873244 0.008018453 0.011222922 0.004016710 0.003730747 0.002802569
## [61] 0.005003585 0.009842538 0.011927105 0.002632572 0.004358224 0.007810856
## [67] 0.002460359 0.004012151 0.003376408 0.005212962 0.004053974 0.005214051
## [73] 0.002997272 0.005226974 0.003834771 0.002517078 0.004997292 0.006384830
## [79] 0.003402068 0.002961370 0.013766366 0.002083394 0.003893220 0.007720234
## [85] 0.001981555 0.004521076 0.021397327 0.002820532 0.007182648 0.004048591
## [91] 0.003789111 0.033423764
# Footprint area per person against population, labelled with NAME_U.
plot(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000)
text(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000, labels = dat$NAME_U)

# Mean building size against household size. A new plot is opened first:
# text() only annotates the current figure, so without it these labels would
# be placed using the coordinates of the per-capita plot above.
plot(dat$AVE_HH_SZ, dat$MEAN_SHAPE,
  xlab = "Average household size",
  ylab = "Mean building size"
)
text(dat$AVE_HH_SZ, dat$MEAN_SHAPE, labels = dat$NAME_U)

# Keep only the numeric attribute columns. vapply() guarantees a logical mask
# of one value per column, and drop = FALSE keeps the result a data frame even
# if only a single column qualifies.
dat.numeric <- shape@data[
  ,
  vapply(shape@data, is.numeric, logical(1)),
  drop = FALSE
]